1 Notes

This report was generated on 2021-08-02 10:19:21. R version: 4.1.0 on x86_64-pc-linux-gnu. For this report, CRAN packages as of 2021-06-01 were used.

1.1 R-Script & data

The preprocessing and analysis of the data was conducted in the R project for statistical computing. The RMarkdown script used to generate this document and all the resulting data can be downloaded under this link. Through executing main.Rmd, the herein described process can be reproduced and this document can be generated. In the course of this, data from the folder input will be processed and results will be written to output. The html on-line version of the analysis can be accessed through this link.

1.2 GitHub

The code for the herein described process can also be freely downloaded from https://github.com/fernandomillanvillalobos/datavizR.

1.3 License

1.4 Data description of output files

1.4.0.1 abc.csv (Example)

Attribute Type Description
a Numeric
b Numeric
c Numeric

1.4.0.2 xyz.csv

2 Set up

## [1] "package package:rstudioapi detached"
## [1] "package package:knitr detached"

2.1 Define packages

# Technique taken from the checkpoint vignette
# https://mran.revolutionanalytics.com/web/packages/checkpoint/vignettes/using-checkpoint-with-knitr.html
# The library() calls below are written verbatim to manifest.R.
# To drop a package, delete its line (commenting it out is not sufficient).
# tidyverse: see https://blog.rstudio.org/2016/09/15/tidyverse-1-0-0/
cat(
  file = "manifest.R",
  "
library(rstudioapi)
library(tidyverse, warn.conflicts = FALSE) # ggplot2, dplyr, tidyr, readr, purrr, tibble, magrittr, readxl
library(scales) # scales for ggplot2
library(jsonlite) # json
library(lintr) # code linting
library(sf) # spatial data handling
library(rmarkdown)
library(data.table)
library(cowplot) # theme
library(extrafont)
library(waldo) # compare
library(psych) # some useful funs 
library(ggrepel) # text labels
library(skimr) # data quality 
library(gapminder) #data sets 
library(socviz) # book Data Visualization: A Practical...
library(RColorBrewer)
library(dichromat) # palettes for color-blind
library(ggridges) # density ridges plots
library(viridis) # colors
library(janitor)" # names
)

2.2 Install packages

# Bootstrap the checkpoint package (for people using this system for the
# first time), then install the packages for the chosen CRAN snapshot.
# package_date, path_to_wd and r_version are not defined in this chunk and
# must already exist when it runs.
if (!requireNamespace("checkpoint", quietly = TRUE)) {
  # devtools is only needed to install checkpoint from GitHub
  if (!requireNamespace("devtools", quietly = TRUE)) {
    install.packages("devtools", repos = "http://cran.us.r-project.org")
  }
  devtools::install_github("RevolutionAnalytics/checkpoint",
                           ref = "v0.3.2", # could be adapted later,
                           # as of now (beginning of July 2017
                           # this is the current release on CRAN)
                           repos = "http://cran.us.r-project.org")
}
# library() stops with an error if the installation above failed, whereas
# require() would only return FALSE and let the script continue.
library(checkpoint)
# nolint start
if (!dir.exists("~/.checkpoint")) {
  dir.create("~/.checkpoint")
}
# nolint end
# install packages for the specified CRAN snapshot date
# NOTE(review): the argument names mix snapshot_date with project,
# scanForPackages, use.knitr and R.version -- confirm they all match the
# installed checkpoint version.
checkpoint(snapshot_date = package_date,
           project = path_to_wd,
           verbose = TRUE,
           scanForPackages = TRUE,
           use.knitr = FALSE,
           R.version = r_version)
rm(package_date)

2.3 Load packages

# Run the code in manifest.R
source("manifest.R")
# Delete manifest.R now that it has been sourced
unlink("manifest.R")
# Print the R version, platform and package versions of this session
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] janitor_2.1.0      viridis_0.6.1      viridisLite_0.4.0  ggridges_0.5.3    
##  [5] dichromat_2.0-0    RColorBrewer_1.1-2 socviz_1.2         gapminder_0.3.0   
##  [9] skimr_2.1.3        ggrepel_0.9.1      psych_2.1.3        waldo_0.2.5       
## [13] extrafont_0.17     cowplot_1.1.1      data.table_1.14.0  rmarkdown_2.8     
## [17] sf_0.9-8           lintr_2.0.1        jsonlite_1.7.2     scales_1.1.1      
## [21] forcats_0.5.1      stringr_1.4.0      dplyr_1.0.6        purrr_0.3.4       
## [25] readr_1.4.0        tidyr_1.1.3        tibble_3.1.2       ggplot2_3.3.3     
## [29] tidyverse_1.3.1    rstudioapi_0.13    checkpoint_1.0.0  
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-152       fs_1.5.0           lubridate_1.7.10   httr_1.4.2        
##  [5] rprojroot_2.0.2    repr_1.1.3         tools_4.1.0        backports_1.2.1   
##  [9] bslib_0.2.5.1      utf8_1.2.1         R6_2.5.0           KernSmooth_2.23-20
## [13] DBI_1.1.1          lazyeval_0.2.2     colorspace_2.0-1   withr_2.4.2       
## [17] gridExtra_2.3      tidyselect_1.1.1   mnormt_2.0.2       processx_3.5.2    
## [21] compiler_4.1.0     extrafontdb_1.0    cli_2.5.0          rvest_1.0.0       
## [25] xml2_1.3.2         desc_1.3.0         sass_0.4.0         classInt_0.4-3    
## [29] callr_3.7.0        proxy_0.4-25       digest_0.6.27      base64enc_0.1-3   
## [33] pkgconfig_2.0.3    htmltools_0.5.1.1  dbplyr_2.1.1       rlang_0.4.11      
## [37] readxl_1.3.1       jquerylib_0.1.4    generics_0.1.0     magrittr_2.0.1    
## [41] Rcpp_1.0.6         munsell_0.5.0      fansi_0.5.0        lifecycle_1.0.0   
## [45] stringi_1.6.2      yaml_2.2.1         snakecase_0.11.0   plyr_1.8.6        
## [49] grid_4.1.0         parallel_4.1.0     crayon_1.4.1       lattice_0.20-44   
## [53] haven_2.4.1        hms_1.1.0          tmvnsim_1.0-2      knitr_1.33        
## [57] ps_1.6.0           pillar_1.6.1       reprex_2.0.0       glue_1.4.2        
## [61] evaluate_0.14      rex_1.2.0          remotes_2.3.0      modelr_0.1.8      
## [65] vctrs_0.3.8        Rttf2pt1_1.3.8     cellranger_1.1.0   gtable_0.3.0      
## [69] assertthat_0.2.1   xfun_0.23          broom_0.7.6        e1071_1.7-7       
## [73] cyclocomp_1.1.0    class_7.3-19       units_0.7-1        ellipsis_0.3.2

2.4 Load additional scripts

# Helper scripts with the visualization functions (see README for further
# information on outsourcing logic to other script files).
# Each script is passed to knitr::read_chunk() and then sourced, one script
# after the other in the order listed.
invisible(lapply(
  c(
    "scripts/dviz.supp.R",
    "scripts/themes.R",
    "scripts/plot_grid.R",
    "scripts/align_legend.R",
    "scripts/label_log10.R",
    "scripts/outliers.R"
  ),
  function(script_path) {
    knitr::read_chunk(script_path)
    source(script_path)
  }
))

3 Data Visualization: A Practical Introduction (Kieran Healy)

3.1 Show the Right Numbers

3.1.1 Grouped Data and the “Group” Aesthetic

The group aesthetic is usually only needed when the grouping information you need to tell ggplot about is not built into the variables being mapped.

# GDP per capita over time, one line per country: the group aesthetic tells
# geom_line() which yearly observations belong together.
p <- ggplot(gapminder, aes(x = year, y = gdpPercap))
p + geom_line(mapping = aes(group = country))

3.1.2 Facet to Make Small Multiples

The facet_wrap() function can take a series of arguments, but the most important is the first one, which is specified using R’s “formula” syntax, which uses the tilde character, ~. Facets are usually a one-sided formula. Most of the time you will just want a single variable on the right side of the formula.

# Same country lines, split into one panel per continent
p <- ggplot(gapminder, aes(x = year, y = gdpPercap))
p +
  geom_line(mapping = aes(group = country)) +
  facet_wrap(~ continent)

# Polished version: gray country lines, a loess smoother per panel, a log
# y-axis labelled in dollars, and all five panels in a single row
p <- ggplot(gapminder, aes(x = year, y = gdpPercap))
p +
  geom_line(mapping = aes(group = country), color = "gray70") +
  geom_smooth(method = "loess", se = FALSE, size = 1.1) +
  scale_y_log10(labels = scales::dollar) +
  facet_wrap(~ continent, ncol = 5) +
  labs(
    title = "GDP per capita on Five Continents",
    x = "Year",
    y = "GDP per capita"
  )

The facet_wrap() function is best used when you want a series of small multiples based on a single categorical variable. Your panels will be laid out in order and then wrapped into a grid. If you wish you can specify the number of rows or the number of columns in the resulting layout. Facets can be more complex than this. For instance, you might want to cross-classify some data by two categorical variables. In that case you should try facet_grid() instead. This function will lay out your plot in a true two-dimensional arrangement, instead of a series of panels wrapped into a grid.

# Age against number of children, cross-classified into a grid of panels:
# sex defines the rows, race the columns
p <- ggplot(gss_sm, aes(x = age, y = childs))
p +
  geom_point(alpha = 0.2) +
  geom_smooth() +
  facet_grid(sex ~ race)

Multipanel layouts of this kind are especially effective when used to summarize continuous variation (as in a scatterplot) across two or more categorical variables, with the categories (and hence the panels) ordered in some sensible way.

3.1.3 Geoms Can Transform Data

Some geoms plot our data directly on the figure, as is the case with geom_point(), which takes variables designated as x and y and plots the points on a grid. But other geoms clearly do more work on the data before it gets plotted. Every geom_ function has an associated stat_ function that it uses by default. The reverse is also the case: every stat_ function has an associated geom_ function that it will plot by default if you ask it to. Sometimes the calculations being done by the stat_ functions that work together with the geom_ functions might not be immediately obvious. When ggplot calculates the count or the proportion, it returns temporary variables that we can use as mappings in our plots.

# geom_bar() calls the default stat_ function associated with it,
# stat_count(), so the y-axis shows the number of observations per region.
p <- ggplot(data = gss_sm, mapping = aes(x = bigregion))
p + geom_bar()

# Mapping y to the computed proportion removes the count from the y-axis, but
# every bar now has a value of 1, so all the bars are the same height. We want
# them to sum to 1, so that we get the number of observations per region as a
# proportion of the total number of observations. This is a grouping issue
# again. In a sense, it is the reverse of the earlier grouping problem we
# faced when we needed to tell ggplot that our yearly data was grouped by
# country.
# after_stat(prop) (ggplot2 >= 3.3.0) replaces the older ..prop.. notation.
p + geom_bar(mapping = aes(y = after_stat(prop)))

# To fix this we tell ggplot to ignore the x-categories when calculating the
# denominator of the proportion, and to use the total number of observations
# instead, by specifying group = 1 inside the aes() call. The value of 1 is
# just a kind of "dummy group" that tells ggplot to use the whole dataset when
# establishing the denominator for its prop calculations.
p + geom_bar(mapping = aes(y = after_stat(prop), group = 1))

# Another example: fill mapped to the same variable as x.
p <- ggplot(data = gss_sm,
            mapping = aes(x = religion, fill = religion))
# guides(fill = "none") removes the legend. "none" is used instead of FALSE
# because newer ggplot2 releases deprecate the logical form.
p + geom_bar() + guides(fill = "none")

3.1.4 Frequency Plots the Slightly Awkward Way

A more appropriate use of the fill aesthetic with geom_bar() is to cross-classify two categorical variables. This is the graphical equivalent of a frequency table of counts or proportions. When we cross-classify categories in bar charts, there are several ways to display the results. With geom_bar() the output is controlled by the position argument.

# Religion within region. The default output of geom_bar() is a stacked bar
# chart of counts.
p <- ggplot(data = gss_sm,
            mapping = aes(x = bigregion, fill = religion))
p + geom_bar()

# An alternative choice is to set the position argument to "fill": the bars
# are all the same height and show relative shares within each region.
p + geom_bar(position = "fill")

# Dodged bars of proportions. When we just wanted the overall proportions for
# one variable, we mapped group = 1 so that ggplot calculated them with
# respect to the overall N. Here the grouping variable is religion, so the
# proportions are calculated within each religion.
# after_stat(prop) (ggplot2 >= 3.3.0) replaces the older ..prop.. notation.
p + geom_bar(position = "dodge",
             mapping = aes(y = after_stat(prop), group = religion))

# We can ask ggplot to give us a proportional bar chart of religious
# affiliation, and then facet that by region.
p <- ggplot(data = gss_sm,
            mapping = aes(x = religion))
p + geom_bar(position = "dodge",
             mapping = aes(y = after_stat(prop), group = bigregion)) +
    facet_wrap(~ bigregion, ncol = 1)

3.1.5 Histograms and density plots

A histogram is a way of summarizing a continuous variable by chopping it up into segments or “bins” and counting how many observations are found within each bin. In a bar chart, the categories are given to us going in (e.g., regions of the country, or religious affiliation). With a histogram, we have to decide how finely to bin the data. As with the bar charts, a newly-calculated variable, count, appears on the y-axis.

While histograms summarize single variables, it’s also possible to use several at once to compare distributions. We can facet histograms by some variable of interest.

# By default, the geom_histogram() function will choose a bin size for us
# based on a rule of thumb.
p <- ggplot(data = midwest,
            mapping = aes(x = area))
p + geom_histogram()

# Selecting another bin size
p + geom_histogram(bins = 10)

# Subset the data: the %in% operator is a convenient way to filter on more
# than one term in a variable.
oh_wi <- c("OH", "WI")
p <- ggplot(data = subset(midwest, subset = state %in% oh_wi),
            mapping = aes(x = percollege, fill = state))
p + geom_histogram(alpha = 0.4, bins = 20)

# When working with a continuous variable, an alternative to binning the data
# and making a histogram is to calculate a kernel density estimate of the
# underlying distribution.
p <- ggplot(data = midwest,
            mapping = aes(x = area, fill = state, color = state))
p + geom_density(alpha = 0.3)

# For geom_density(), the stat_density() function can return its default
# density statistic, or scaled, which will give a proportional density
# estimate. It can also return a statistic called count, which is the density
# times the number of points. This can be used in stacked density plots.
# Computed statistics are referenced with after_stat() (ggplot2 >= 3.3.0),
# which replaces the older ..scaled.. notation.
p <- ggplot(data = subset(midwest, subset = state %in% oh_wi),
            mapping = aes(x = area, fill = state, color = state))
p + geom_density(alpha = 0.3, mapping = aes(y = after_stat(scaled)))

3.1.6 Avoid transformations when necessary

Often our data is, in effect, already a summary table. This can happen when we have computed a table of marginal frequencies or percentages from the original data. Because we are working directly with percentage values in a summary table, we no longer have any need for ggplot to count up values for us or perform any other calculations. That is, we do not need the services of any stat_ functions. We can tell geom_bar() not to do any work on the variable before plotting it. To do this we say stat = ‘identity’ in the geom_bar() call.

# titanic already holds percentages, so stat = "identity" tells geom_bar() to
# plot the values as given instead of counting rows.
p <- ggplot(data = titanic,
            mapping = aes(x = fate, y = percent, fill = sex))
p + geom_bar(position = "dodge", stat = "identity") +
  theme(legend.position = "top")

# For convenience ggplot also provides a related geom, geom_col(), which has
# exactly the same effect but assumes that stat = "identity".
# The position argument in geom_bar() and geom_col() can also take the value
# of "identity". Just as stat = "identity" means "don't do any summary
# calculations", position = "identity" means "just plot the values as given".
p <- ggplot(data = oecd_sum,
            mapping = aes(x = year, y = diff, fill = hi_lo))
# guides(fill = "none") drops the legend ("none" instead of FALSE, which
# newer ggplot2 releases deprecate). The subtitle and caption are built with
# paste0() and an explicit \n so that the line break is kept without the
# source indentation leaking into the rendered text.
p + geom_col() + guides(fill = "none") +
  labs(x = NULL, y = "Difference in Years",
       title = "The US Life Expectancy Gap",
       subtitle = paste0("Difference between US and OECD\n",
                         "average life expectancies, 1960-2015"),
       caption = paste0("Data: OECD. After a chart by Christopher Ingraham,\n",
                        "Washington Post, December 27th 2017."))

3.2 Graph Tables, Add Labels, Make Notes

3.2.1 Use Pipes to Summarize Data

Letting the geoms (and their stat_ functions) do the work can sometimes get a little confusing. It is too easy to lose track of whether one has calculated row margins, column margins, or overall relative frequencies. A better strategy is to calculate the frequency table you want first and then plot that table. This has the benefit of allowing you to do some quick sanity checks on your tables, to make sure you haven’t made any errors.

In addition to making our code easier to read, it lets us more easily perform sanity checks on our results, so that we are sure we have grouped and summarized things in the right order.

# Frequency table of religion within region.
# group_by() lists the variables from outermost to innermost. summarize()
# then peels off the innermost level (religion); .groups = "drop_last" makes
# explicit that the result stays grouped by bigregion, which is what the
# sum(N) denominator in mutate() relies on. The table is ungrouped at the end
# so that no grouping leaks into later steps.
rel_by_region <- gss_sm %>%
  group_by(bigregion, religion) %>%
  summarize(N = n(), .groups = "drop_last") %>%
  mutate(freq = N / sum(N), # relative proportion within the region
         pct = round((freq * 100), 0)) %>% # percentage, rounded
  ungroup()

# Checking pct: the percentages should add up to 100 within each region
# (totals of 101 are rounding error, because pct is rounded per row).
rel_by_region %>%
  group_by(bigregion) %>%
  summarize(total = sum(pct))
bigregion total
Northeast 100
Midwest 101
South 100
West 101
# As a rule, dodged charts can be more cleanly expressed as faceted plots.
# Faceting removes the need for a legend and thus makes the chart simpler to
# read.
p <- ggplot(rel_by_region, aes(x = religion, y = pct, fill = religion))
p + geom_col(position = "dodge2") +
    labs(x = NULL, y = "Percent", fill = "Religion") +
    # "none" instead of FALSE, which newer ggplot2 releases deprecate
    guides(fill = "none") +
    coord_flip() + # flip the axes so the religion labels are easy to read
    facet_grid(~ bigregion)

3.2.2 Continuous Variables by Group or Category

The variables specified in group_by() are retained in the new data frame, the variables created with summarize() are added, and all the other variables in the original data are dropped.

We generally want our plots to present data in some meaningful order. The reorder() function will do this for us. It takes two required arguments. The first is the categorical variable or factor that we want to reorder. In this case, that’s country. The second is the variable we want to reorder it by. Here that is the donation rate, donors. The third and optional argument to reorder() is the function you want to use as a summary statistic. If you give reorder() only the first two required arguments, then by default it will reorder the categories of your first variable by the mean value of the second. You can use any sensible function you like to reorder the categorical variable (e.g., median, or sd).

# Peek at the data: first six columns, ten randomly picked rows
organdata %>%
  select(1:6) %>%
  sample_n(size = 10)
country year donors pop pop_dens gdp
Germany 1996-01-01 12.70 81915 22.9434501 22164
Ireland 1994-01-01 20.30 3590 5.1088658 15990
France NA NA NA NA NA
Ireland 2002-01-01 21.00 3932 5.5955600 32571
Ireland 1996-01-01 16.80 3636 5.1743276 19245
Germany 1993-01-01 13.90 81156 22.7308630 19983
Netherlands 1998-01-01 13.00 15707 37.8208524 24780
France 1996-01-01 15.10 58026 10.5214869 21990
Canada 2002-01-01 13.00 31414 0.3150660 30429
Australia 1999-01-01 8.67 18926 0.2444834 25445
# Scatterplot of donors by year
p <- ggplot(organdata, aes(x = year, y = donors))
p + geom_point()

# Line plot, one panel per country
p <- ggplot(organdata, aes(x = year, y = donors))
p +
  geom_line(mapping = aes(group = country)) +
  facet_wrap(~ country)

# Boxplot per country, flipped so the country names sit on the vertical axis
p <- ggplot(organdata, aes(x = country, y = donors))
p +
  geom_boxplot() +
  coord_flip()

# Boxplot with the countries reordered by donors (reorder() is given no
# summary function here; na.rm = TRUE is passed on to it)
p <- ggplot(
  organdata,
  aes(x = reorder(country, donors, na.rm = TRUE), y = donors)
)
p +
  geom_boxplot() +
  labs(x = NULL) +
  coord_flip()

# Violin plot, reordered and filled by world
p <- ggplot(
  organdata,
  aes(x = reorder(country, donors, na.rm = TRUE), y = donors, fill = world)
)
p +
  geom_violin() +
  labs(x = NULL) +
  coord_flip() +
  theme(legend.position = "top")

# Dotplot, reordered and colored by world
p <- ggplot(
  organdata,
  aes(x = reorder(country, donors, na.rm = TRUE), y = donors, color = world)
)
p +
  geom_point() +
  labs(x = NULL) +
  coord_flip() +
  theme(legend.position = "top")

# Same dotplot with a little jitter to avoid overplotting
p <- ggplot(
  organdata,
  aes(x = reorder(country, donors, na.rm = TRUE), y = donors, color = world)
)
p +
  geom_jitter(position = position_jitter(width = 0.15)) +
  labs(x = NULL) +
  coord_flip() +
  theme(legend.position = "top")

When we want to summarize a categorical variable that just has one point per category, we should use this approach as well. The result will be a Cleveland dotplot, a simple and extremely effective method of presenting data that is usually better than either a bar chart or a table. Cleveland dotplots are generally preferred to bar or column charts. When making them, put the categories on the y-axis and order them in the way that is most relevant to the numerical summary you are providing. This sort of plot is also an excellent way to summarize model results or any data with error ranges.

# Country-level summary, written out one statistic at a time
by_country <- organdata %>%
  group_by(consent_law, country) %>%
  summarize(
    donors_mean = mean(donors, na.rm = TRUE),
    donors_sd = sd(donors, na.rm = TRUE),
    gdp_mean = mean(gdp, na.rm = TRUE),
    health_mean = mean(health, na.rm = TRUE),
    roads_mean = mean(roads, na.rm = TRUE),
    cerebvas_mean = mean(cerebvas, na.rm = TRUE)
  )

# The same idea, more compactly: apply mean and sd to every numeric column.
# This overwrites the by_country table created above.
# list() is used instead of funs(). Because the list is unnamed, the new
# columns are named after the original variable with a positional suffix:
# _fn1 for the first function (mean) and _fn2 for the second (sd).
by_country <- organdata %>%
  group_by(consent_law, country) %>%
  summarize_if(is.numeric, list(mean, sd), na.rm = TRUE) %>%
  ungroup()
by_country
consent_law country donors_fn1 pop_fn1 pop_dens_fn1 gdp_fn1 gdp_lag_fn1 health_fn1 health_lag_fn1 pubhealth_fn1 roads_fn1 cerebvas_fn1 assault_fn1 external_fn1 txp_pop_fn1 donors_fn2 pop_fn2 pop_dens_fn2 gdp_fn2 gdp_lag_fn2 health_fn2 health_lag_fn2 pubhealth_fn2 roads_fn2 cerebvas_fn2 assault_fn2 external_fn2 txp_pop_fn2
Informed Australia 10.63500 18317.923 0.2366284 22178.54 21779.43 1957.500 1848.214 5.676923 104.87573 557.6923 16.769231 393.0000 0.8751195 1.1428075 830.89394 0.0107334 3958.506 4085.883 481.6276 460.0962 0.4245661 14.327316 82.69863 1.8327505 26.76440 0.0396299
Informed Canada 13.96667 29607.923 0.2969520 23711.08 23353.07 2271.929 2163.429 6.676923 109.26011 422.3846 16.769231 410.6154 1.0485954 0.7511607 1192.74791 0.0119626 3965.847 4038.868 420.5751 379.1317 0.3919053 17.679258 38.46544 2.4547181 40.76873 0.0424701
Informed Denmark 13.09167 5257.154 12.2004034 23722.31 23275.00 2054.071 1973.429 6.984615 101.63635 640.6923 12.230769 532.3846 0.7610332 1.4681208 80.60691 0.1870664 3895.685 4100.016 371.3614 357.7605 0.1519109 12.421001 46.27163 2.1273554 33.60441 0.0116730
Informed Germany 13.04167 80254.846 22.4784601 22163.23 21938.36 2348.750 2256.250 8.142308 112.78873 706.7692 9.538462 391.3077 0.5508479 0.6111960 5157.63285 1.4445937 2501.344 2546.250 377.2275 383.7725 0.6244485 25.911094 126.03515 1.6641006 56.97424 0.0437534
Informed Ireland 19.79167 3673.615 5.2278574 20824.38 20153.64 1479.929 1340.786 4.876923 117.77424 704.6923 8.538462 394.0000 0.8175829 2.4784373 131.59378 0.1872688 6669.580 6881.862 565.5526 482.4379 0.2862221 10.761587 87.20320 1.5607362 18.85471 0.0287060
Informed Netherlands 13.65833 15547.692 37.4372557 23013.15 22553.64 1992.786 1884.857 5.700000 76.09357 584.9231 11.153846 285.8462 0.7078763 1.5518074 372.96434 0.8980600 3769.961 4009.418 417.0621 377.0521 0.3082207 9.930020 52.23259 1.3445045 18.62725 0.0169759
Informed United Kingdom 13.49167 58186.692 23.9540127 21359.31 20962.50 1561.214 1463.500 5.761539 67.92936 707.9231 8.923077 287.9231 0.7047037 0.7751344 626.34567 0.2578509 3929.497 4056.793 405.0679 374.9447 0.3548203 10.467402 93.43577 1.6563785 15.14079 0.0075729
Informed United States 19.98167 269329.769 2.7970428 29211.77 28699.43 3988.286 3760.429 5.776923 155.16783 444.3846 80.384615 530.0000 1.0193884 1.3253667 12544.86916 0.1302809 4571.160 4791.979 864.9320 807.5220 0.4621577 8.353810 16.04960 17.8724111 32.15587 0.0476772
Presumed Austria 23.52500 7927.308 9.4530261 23875.85 23415.07 1875.357 1803.143 5.492308 149.86541 768.8462 10.923077 506.8462 0.6308434 2.4159037 109.19507 0.1302112 3342.889 3645.228 296.8980 316.8549 0.2660249 30.281692 119.64242 2.3259958 62.13540 0.0088249
Presumed Belgium 21.90000 10153.308 30.6746456 22499.62 22095.93 1958.357 1862.429 6.188889 154.69504 593.8462 14.307692 541.6154 0.7880048 1.9357874 109.16378 0.3297999 3170.584 3400.119 405.1142 403.2977 0.2027588 20.556129 55.24920 3.6602508 22.87116 0.0084838
Presumed Finland 18.44167 5111.846 1.5117096 21018.92 20763.00 1615.286 1559.786 5.861538 93.57447 771.3846 27.461538 721.9231 0.5869704 1.5264089 68.62561 0.0202944 3667.866 3651.757 202.9780 181.5384 0.7274825 19.007381 136.47865 3.2304640 66.87234 0.0079285
Presumed France 16.75833 58055.692 10.5268708 22602.85 22210.71 2159.643 2066.429 7.076923 156.15327 432.6923 8.923077 602.6923 0.7063585 1.5974174 851.44929 0.1543879 3260.346 3459.035 397.2170 371.9650 0.2241794 20.063260 54.53345 1.6563785 46.63758 0.0103484
Presumed Italy 11.10000 57359.692 19.0348750 21554.15 21194.93 1757.000 1689.071 5.984615 121.94294 712.1538 14.923077 368.8462 0.4533029 4.2769998 424.68309 0.1409315 2781.309 2991.191 271.2379 264.0039 0.4469268 10.157891 118.03237 5.9646394 42.80936 0.0033601
Presumed Norway 15.44167 4386.231 1.3542765 26448.38 25769.36 2217.214 2125.429 6.830769 69.99821 661.6154 9.538462 423.3077 0.2280896 1.1090195 97.25752 0.0300289 6491.668 6734.625 606.2047 663.5244 0.3065524 6.676658 100.37890 2.4703368 43.53999 0.0050521
Presumed Spain 28.10833 39666.231 7.8393310 16933.00 16584.29 1289.071 1220.071 5.453846 161.11430 654.7692 8.692308 376.6154 0.7062535 4.9630376 950.90309 0.1879292 2888.343 3066.466 265.8960 269.3863 0.1450022 35.251103 138.65013 0.9473309 35.16281 0.0164238
Presumed Sweden 13.12500 8789.231 1.9533360 22415.46 22094.00 1951.357 1868.000 7.315385 72.34575 595.3077 11.153846 395.8462 0.6827600 1.7535030 113.62376 0.0252520 3213.468 3313.422 372.9790 329.3088 0.2303843 13.246920 49.68465 1.6756170 37.58733 0.0089211
Presumed Switzerland 14.18250 7036.846 17.0424949 27233.00 26931.29 2776.071 2655.643 5.423077 96.38543 423.5385 10.769231 488.2308 0.9953044 1.7090940 169.77330 0.4111729 2153.454 2356.923 475.6701 464.0117 0.6043645 21.701876 72.99956 3.5155333 96.19958 0.0242808
# Cleveland dotplot: countries ordered by donors_fn1, colored by consent law
p <- ggplot(
  by_country,
  aes(x = donors_fn1, y = reorder(country, donors_fn1), color = consent_law)
)
p +
  geom_point(size = 3) +
  labs(x = "Donor Procurement Rate", y = "", color = "Consent Law") +
  theme(legend.position = "top")

# Cleveland dotplot, reordered and faceted by consent law instead of colored
p <- ggplot(
  by_country,
  aes(x = donors_fn1, y = reorder(country, donors_fn1))
)

# ncol = 1 makes the panels appear on top of each other. scales = "free_y"
# frees the y-scale: where one axis is categorical, as here, we can free the
# categorical axis and leave the continuous one fixed.
p +
  geom_point(size = 3) +
  facet_wrap(~ consent_law, scales = "free_y", ncol = 1) +
  labs(x = "Donor Procurement Rate", y = "")

# Dot-and-whisker plot: geom_pointrange() shows us a point estimate and a
# range around it (here donors_fn1 plus/minus donors_fn2)
p <- ggplot(
  by_country,
  aes(x = reorder(country, donors_fn1), y = donors_fn1)
)

p +
  geom_pointrange(
    mapping = aes(
      ymin = donors_fn1 - donors_fn2,
      ymax = donors_fn1 + donors_fn2
    )
  ) +
  labs(x = "", y = "Donor Procurement Rate") +
  coord_flip()

3.2.3 Plot Text Directly

The ggrepel package provides geom_text_repel() and geom_label_repel(), two geoms that can pick out labels much more flexibly than the default geom_text(). The ggrepel package has several other useful geoms and options to aid with effectively plotting labels along with points. The performance of its labeling algorithm is consistently very good. For many purposes it will be a better first choice than geom_text().

# Show columns 2 to 7 of the election data
select(elections_historic, 2:7)
year winner win_party ec_pct popular_pct popular_margin
1824 John Quincy Adams D.-R. 0.3218 0.3092 -0.1044
1828 Andrew Jackson Dem. 0.6820 0.5593 0.1225
1832 Andrew Jackson Dem. 0.7657 0.5474 0.1781
1836 Martin Van Buren Dem. 0.5782 0.5079 0.1420
1840 William Henry Harrison Whig 0.7959 0.5287 0.0605
1844 James Polk Dem. 0.6182 0.4954 0.0145
1848 Zachary Taylor Whig 0.5621 0.4728 0.0479
1852 Franklin Pierce Dem. 0.8581 0.5083 0.0695
1856 James Buchanan Dem. 0.5878 0.4529 0.1220
1860 Abraham Lincoln Rep. 0.5941 0.3965 0.1013
1864 Abraham Lincoln Rep. 0.9099 0.5503 0.1008
1868 Ulysses Grant Rep. 0.7279 0.5266 0.0532
1872 Ulysses Grant Rep. 0.8125 0.5558 0.1180
1876 Rutherford Hayes Rep. 0.5014 0.4792 -0.0300
1880 James Garfield Rep. 0.5799 0.4831 0.0009
1884 Grover Cleveland Dem. 0.5461 0.4885 0.0057
1888 Benjamin Harrison Rep. 0.5810 0.4780 -0.0830
1892 Grover Cleveland Dem. 0.6239 0.4602 0.0301
1896 William McKinley Rep. 0.6063 0.5102 0.0431
1900 William McKinley Rep. 0.6523 0.5164 0.0612
1904 Theodore Roosevelt Rep. 0.7059 0.5642 0.1883
1908 William Taft Rep. 0.6646 0.5157 0.0853
1912 Woodrow Wilson Dem. 0.8192 0.4184 0.1444
1916 Woodrow Wilson Dem. 0.5217 0.4924 0.0312
1920 Warren Harding Rep. 0.7608 0.6032 0.2617
1924 Calvin Coolidge Rep. 0.7194 0.5404 0.2522
1928 Herbert Hoover Rep. 0.8362 0.5821 0.1741
1932 Franklin Roosevelt Dem. 0.8889 0.5741 0.1776
1936 Franklin Roosevelt Dem. 0.9849 0.6080 0.2426
1940 Franklin Roosevelt Dem. 0.8456 0.5474 0.0996
1944 Franklin Roosevelt Dem. 0.8136 0.5339 0.0750
1948 Harry Truman Dem. 0.5706 0.4955 0.0448
1952 Dwight Eisenhower Rep. 0.8324 0.5518 0.1085
1956 Dwight Eisenhower Rep. 0.8606 0.5737 0.1540
1960 John Kennedy Dem. 0.5642 0.4972 0.0017
1964 Lyndon Johnson Dem. 0.9033 0.6105 0.2258
1968 Richard Nixon Rep. 0.5595 0.4342 0.0070
1972 Richard Nixon Rep. 0.9665 0.6067 0.2315
1976 Jimmy Carter Dem. 0.5520 0.5008 0.0206
1980 Ronald Reagan Rep. 0.9089 0.5075 0.0974
1984 Ronald Reagan Rep. 0.9758 0.5877 0.1821
1988 George H. W. Bush Rep. 0.7918 0.5337 0.0772
1992 Bill Clinton Dem. 0.6877 0.4301 0.0556
1996 Bill Clinton Dem. 0.7045 0.4923 0.0851
2000 George W. Bush Rep. 0.5037 0.4787 -0.0510
2004 George W. Bush Rep. 0.5316 0.5073 0.0246
2008 Barack Obama Dem. 0.6784 0.5293 0.0727
2012 Barack Obama Dem. 0.6171 0.5106 0.0386
2016 Donald Trump Rep. 0.5687 0.4625 -0.0175
# Text elements of the plot, kept in variables so the labs() call stays short
p_title <- "Presidential Elections: Popular & Electoral College Margins"
p_subtitle <- "1824-2016"
p_caption <- "Data for 2016 are provisional."
x_label <- "Winner's share of Popular Vote"
y_label <- "Winner's share of Electoral College Votes"

p <- ggplot(
  elections_historic,
  aes(x = popular_pct, y = ec_pct, label = winner_label)
)

# geom_hline() and geom_vline() are two new geoms that draw the reference
# lines at 50%. See also geom_abline(), which draws straight lines based on a
# supplied slope and intercept. geom_text_repel() places the winner labels.
p +
  geom_hline(yintercept = 0.5, size = 1.4, color = "gray80") +
  geom_vline(xintercept = 0.5, size = 1.4, color = "gray80") +
  geom_point() +
  geom_text_repel() +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = p_title,
    subtitle = p_subtitle,
    caption = p_caption,
    x = x_label,
    y = y_label
  )

3.2.4 Label Outliers

Sometimes we want to pick out some points of interest in the data without labeling every single item. Alternatively, we can pick out specific points by creating a dummy variable in the data set just for this purpose.

# Label only selected countries by giving the text layer its own, filtered
# copy of the data while the point layer keeps the full data set.
p <- ggplot(by_country, aes(x = gdp_fn1, y = health_fn1))

# Using subset to filter the data: label rows with gdp_fn1 above 25000
p +
  geom_point() +
  geom_text_repel(
    mapping = aes(label = country),
    data = subset(by_country, gdp_fn1 > 25000)
  )

# Same base plot with a broader condition: high gdp_fn1, low health_fn1,
# or the single country "Belgium".
p <- ggplot(by_country, aes(x = gdp_fn1, y = health_fn1))

p +
  geom_point() +
  geom_text_repel(
    mapping = aes(label = country),
    data = subset(
      by_country,
      gdp_fn1 > 25000 | health_fn1 < 1500 | country %in% "Belgium"
    )
  )

# Creating a dummy variable to subset the data: `ind` is TRUE for rows whose
# ccode is "Ita" or "Spa" and whose year is greater than 1998.
organdata$ind <- organdata$ccode %in% c("Ita", "Spa") &
  organdata$year > 1998

# Points are colored by the dummy variable.
p <- ggplot(data = organdata,
            mapping = aes(x = roads,
                          y = donors, color = ind))

# Only the flagged rows get a text label. The guides are switched off with
# "none": passing FALSE to guides() is deprecated in ggplot2.
p + geom_point() +
  geom_text_repel(data = subset(organdata, ind),
                  mapping = aes(label = ccode)) +
  guides(label = "none", color = "none")

3.2.5 Write and draw in the plot area

Sometimes we want to annotate the figure directly. We use annotate() for this purpose. We will tell annotate() to use a text geom, temporarily taking advantage of its features in order to place something on the plot. The annotate() function can work with other geoms, too. The most obvious use-case is putting arbitrary text on the plot.

p <- ggplot(organdata, aes(x = roads, y = donors))

# annotate() uses the text geom to place a fixed label at x = 91, y = 33;
# hjust = 0 left-aligns the text at that x position.
p +
  geom_point() +
  annotate(
    geom = "text",
    x = 91,
    y = 33,
    hjust = 0,
    label = "A surprisingly high \n recovery rate."
  )

3.2.6 Understanding Scales, Guides, and Themes

Learning about new geoms extended what we have seen already. Each geom makes a different type of plot. Different plots require different mappings in order to work, and so each geom_ function takes mappings tailored to the kind of graph it draws. You can’t use geom_point() to make a scatterplot without supplying an x and a y mapping, for example. Using geom_histogram() only requires you to supply an x mapping. Similarly, geom_pointrange() requires ymin and ymax mappings in order to know where to draw the lineranges it makes. A geom_ function may take optional arguments, too. When using geom_boxplot() you can specify what the outliers look like using arguments like outlier.shape and outlier.color.

Now we’ll make use of new functions for controlling some aspects of the appearance of our graph.

  • Every aesthetic mapping has a scale. If you want to adjust how that scale is marked or graduated, then you use a scale_ function.
  • Many scales come with a legend or key to help the reader interpret the graph. These are called guides. You can make adjustments to them with the guides() function. Perhaps the most common use case is to make the legend disappear, as it is sometimes superfluous. Another is to adjust the arrangement of the key in legends and colorbars.
  • Graphs have other features not strictly connected to the logical structure of the data being displayed. These include things like their background color, the typeface used for labels, or the placement of the legend on the graph. To adjust these, use the theme() function.

Consistent with ggplot’s overall approach, adjusting some visible feature of the graph means first thinking about the relationship that the feature has with the underlying data. Roughly speaking, if the change you want to make will affect the substantive interpretation of any particular geom, then most likely you will either be mapping an aesthetic to a variable using that geom’s aes() function, or you will be specifying a change via some scale_ function. If the change you want to make does not affect the interpretation of a given geom_, then most likely you will either be setting a variable inside the geom_ function, or making a cosmetic change via the theme() function.

# Scatterplot with three aesthetic mappings:
# roads -> x, donors -> y, world -> color.
p <- ggplot(organdata, aes(x = roads, y = donors, colour = world))
p +
  geom_point()

Scales and guides are closely connected, which can make things confusing. The guide provides information about the scale, such as in a legend or colorbar. Thus, it is possible to make adjustments to guides from inside the various scale_ functions. More often it is easier to use the guides() function directly.

A plot with three aesthetic mappings. The variable roads is mapped to x; donors is mapped to y; and world is mapped to color. The x and y scales are both continuous, running smoothly from just under the lowest value of the variable to just over the highest value. Various labeled tick marks orient the reader to the values on each axis. The color mapping also has a scale. The world measure is an unordered categorical variable, so its scale is discrete. It takes one of four values, each represented by a different color.

Along with color, mappings like fill, shape, and size will have scales that we might want to customize or adjust. We could have mapped world to shape instead of color. In that case our four-category variable would have a scale consisting of four different shapes. Scales for these mappings may have labels, axis tick marks at particular positions, or specific colors or shapes. If we want to adjust them, we use one of the scale_ functions.

Many different kinds of variable can be mapped. More often than not x and y are continuous measures. But they might also easily be discrete, as when we mapped country names to the y axis in our boxplots and dotplots. An x or y mapping can also be defined as a transformation onto a log scale, or as a special sort of number value like a date. Similarly, a color or a fill mapping can be discrete and unordered, as with our world variable, or discrete and ordered, as with letter grades in an exam. A color or fill mapping can also be a continuous quantity, represented as a gradient running smoothly from a low to a high value. Finally, both continuous gradients and ordered discrete values might have some defined neutral midpoint with extremes diverging in both directions.

Because we have several potential mappings, and each mapping might be to one of several different scales, we end up with a lot of individual scale_ functions. Each deals with one combination of mapping and scale. They are named according to a consistent logic: scale_<MAPPING>_<KIND>(). First comes the scale_ name, then the mapping it applies to, and finally the kind of value the scale will display. Thus, the scale_x_continuous() function controls x scales for continuous variables; scale_y_discrete() adjusts y scales for discrete variables; and scale_x_log10() transforms an x mapping to a log scale. Most of the time, ggplot will guess correctly what sort of scale is needed for your mapping. Then it will work out some default features of the scale (such as its labels and where the tick marks go). In many cases you will not need to make any scale adjustments. If x is mapped to a continuous variable then adding + scale_x_continuous() to your plot statement with no further arguments will have no effect. It is already there implicitly. Adding + scale_x_log10(), on the other hand, will transform your scale, as now you have replaced the default treatment of a continuous x variable.

If you want to adjust the labels or tick marks on a scale, you will need to know which mapping it is for and what sort of scale it is. Then you supply the arguments to the appropriate scale function. For example, we can change the x-axis of the previous plot to a log scale, and then also change the position and labels of the tick marks on the y-axis.

p <- ggplot(organdata, aes(x = roads, y = donors, color = world))

# Log-transform the x-axis and replace the default y-axis ticks with three
# hand-picked breaks, each with a spelled-out label.
p +
  geom_point() +
  scale_x_log10() +
  scale_y_continuous(
    labels = c("Five", "Fifteen", "Twenty Five"),
    breaks = c(5, 15, 25)
  )

The same applies to mappings like color and fill. Here the available scale_ functions include ones that deal with continuous, diverging, and discrete variables, as well as others that we will encounter later when we discuss the use of color and color palettes in more detail. When working with a scale that produces a legend, we can also use its scale_ function to specify the labels in the key. To change the title of the legend, however, we use the labs() function, which lets us label all the mappings.

p <- ggplot(organdata, aes(x = roads, y = donors, color = world))

# The key labels come from the color scale; the legend title and the axis
# titles are set with labs().
p +
  geom_point() +
  scale_color_discrete(
    labels = c(
      "Corporatist",
      "Liberal",
      "Social Democratic",
      "Unclassified"
    )
  ) +
  labs(
    color = "Welfare State",
    x = "Road Deaths",
    y = "Donor Procurement"
  )

If we want to move the legend somewhere else on the plot, we are making a purely cosmetic decision and that is the job of the theme() function. As we have already seen, adding + theme(legend.position = "top") will move the legend as instructed. Finally, to make the legend disappear altogether, we tell ggplot that we do not want a guide for that scale.

We will use scale_ functions fairly regularly to make small adjustments to the labels and axes of our graphs. And we will occasionally use the theme() function to make some cosmetic adjustments.

p <- ggplot(data = organdata,
            mapping = aes(x = roads,
                          y = donors,
                          color = world))

# Relabel the axes and drop the color legend. The guide is switched off with
# "none": passing FALSE to guides() is deprecated in ggplot2.
p + geom_point() +
  labs(x = "Road Deaths",
       y = "Donor Procurement") +
  guides(color = "none")

3.3 Refine your plots

# Progressive enhancements of the same plot
# v1: 2014 sections only, points plus the default smoother
p <- ggplot(
  subset(asasec, Year == 2014),
  aes(x = Members, y = Revenues, label = Sname)
)

p +
  geom_point() +
  geom_smooth()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# v2: color the points by Journal and switch to a linear fit
p <- ggplot(
  subset(asasec, Year == 2014),
  aes(x = Members, y = Revenues, label = Sname)
)

p +
  geom_point(aes(color = Journal)) +
  geom_smooth(method = "lm")

# v3: draw the fit line first (no confidence band, light gray) so that the
# colored points are layered on top of it
p0 <- ggplot(
  subset(asasec, Year == 2014),
  aes(x = Members, y = Revenues, label = Sname)
)

p1 <- p0 +
  geom_smooth(method = "lm", se = FALSE, color = "gray80") +
  geom_point(aes(color = Journal))

# v4: label only the 2014 sections with Revenues above 7000
p2 <- p1 +
  geom_text_repel(
    data = subset(asasec, Year == 2014 & Revenues > 7000),
    size = 2
  )
p2

# v5: titles and labels, dollar-formatted y-axis, legend at the bottom
p3 <- p2 +
  labs(
    title = "ASA Sections",
    subtitle = "2014 Calendar year.",
    caption = "Source: ASA annual report.",
    x = "Membership",
    y = "Revenues",
    color = "Section has own Journal"
  )
p4 <- p3 +
  scale_y_continuous(labels = scales::dollar) +
  theme(legend.position = "bottom")
p4

3.3.1 Use color to your advantage

You should choose a color palette in the first place based on its ability to express the data you are plotting. Take care to choose a palette that reflects the structure of your data. Separate from these mapping issues, there are considerations about which colors in particular to choose. In general, the default color palettes that ggplot makes available are well-chosen for their perceptual properties and aesthetic qualities. We can also use color and color layers as a device for emphasis, to highlight particular data points or parts of the plot, perhaps in conjunction with other features.

We choose color palettes for mappings through one of the scale_ functions for color or fill. While it is possible to very finely control the look of your color schemes by varying the hue, chroma, and luminance of each color you use via scale_color_hue(), or scale_fill_hue(), in general this is not recommended. Instead you should use the RColorBrewer package to make a wide range of named color palettes available to you. When used in conjunction with ggplot, you access these colors by specifying the scale_color_brewer() or scale_fill_brewer() functions, depending on the aesthetic you are mapping.

You can also specify colors manually, via scale_color_manual() or scale_fill_manual(). These functions take a values argument that can be specified as a vector of color names or color values that R knows about. The ability to manually specify colors can be useful when the meaning of a category itself has a strong color association. R knows many color names (like red, and green, and cornflowerblue). Try demo('colors') for an overview. Alternatively, color values can be specified via their hexadecimal RGB value. This is a way of encoding color values in the RGB colorspace, where each channel can take a value from 0 to 255. A color hex value begins with a hash or pound character, #, followed by three pairs of hexadecimal or “hex” numbers. Hex values are in Base 16, with the first six letters of the alphabet standing for the numbers 10 to 15. This allows a two-character hex number to range from 0 to 255. You read them as #rrggbb, where rr is the two-digit hex code for the red channel, gg for the green channel, and bb for the blue channel. So #CC55DD translates in decimal to CC = 204 (red), 55 = 85 (green), and DD = 221 (blue). It gives a strong pink color.

If we are serious about using a safe palette for color-blind viewers, we should investigate the dichromat package (The colorblindr package has similar functionality) instead. It provides a range of safe palettes and some useful functions for helping you approximately see what your current palette might look like to a viewer with one of several different kinds of color blindness.

# Base scatterplot shared by the three palette variants below.
p <- ggplot(organdata, aes(x = roads, y = donors, color = world))

# ColorBrewer palette "Set2"
p +
  geom_point(size = 2) +
  scale_color_brewer(palette = "Set2") +
  theme(legend.position = "top")

# ColorBrewer palette "Pastel2"
p +
  geom_point(size = 2) +
  scale_color_brewer(palette = "Pastel2") +
  theme(legend.position = "top")

# ColorBrewer palette "Dark2"
p +
  geom_point(size = 2) +
  scale_color_brewer(palette = "Dark2") +
  theme(legend.position = "top")

# Defining your own palette: eight colors given as hexadecimal RGB values
cb_palette <- c(
  "#999999", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Apply the manual palette to the color scale of the existing p4 plot.
p4 +
  scale_color_manual(values = cb_palette)

# Setting default color palette: five colors from the "Set2" Brewer palette
Default <- brewer.pal(5, "Set2")

# safety colors from dichromat: `types` holds the codes passed to dichromat(),
# its names become the column names of the comparison table
types <- c("deutan", "protan", "tritan")
names(types) <- c("Deuteronopia", "Protanopia", "Tritanopia")

# One column per type, each holding the Default colors as transformed by
# dichromat() for that type; the untransformed palette is inserted as the
# first column. `.before` takes a one-based column index, so 1 is used
# rather than relying on TRUE being coerced to 1.
color_table <- types %>%
  purrr::map(~ dichromat(Default, .x)) %>%
  as_tibble() %>%
  add_column(Default, .before = 1)

color_table
Default Deuteronopia Protanopia Tritanopia
#66C2A5 #AEAEA7 #BABAA5 #82BDBD
#FC8D62 #B6B661 #9E9E63 #F29494
#8DA0CB #9C9CCB #9E9ECB #92ABAB
#E78AC3 #ACACC1 #9898C3 #DA9C9C
#A6D854 #CACA5E #D3D355 #B6C8C8
# Pass the palette table to color_comp() for a side-by-side comparison
# (presumably socviz::color_comp(), which plots the colors — TODO confirm).
color_comp(color_table)

3.3.2 Layer color and text together

Aside from mapping variables directly, color is also very useful when we want to pick out or highlight some aspect of our data. It is in cases like this that the layered approach of ggplot can really work to our advantage.

We will build up a plot of data about the 2016 US general election. It is contained in the county_data object in the socviz library. We begin by defining a blue and red color for the Democrats and Republicans. Then we create the basic setup and first layer of the plot. We subset the data, including only counties with a value of “No” on the flipped variable. We set the color of geom_point() to be a light gray, as it will form the background layer of the plot. And we apply a log transformation to the x-axis scale.

In the next step we add a second geom_point() layer. Here we start with the same dataset but extract a complementary subset from it. This time we choose the “Yes” counties on the flipped variable. The x and y mappings are the same, but we add a color scale for these points, mapping the partywinner16 variable to the color aesthetic. Then we specify a manual color scale with scale_color_manual(), where the values are the blue and red party_colors we defined above.

The next layer sets the y-axis scale and the labels.

Finally, we add a third layer using the geom_text_repel() function. Once again we supply a set of instructions to subset the data for this text layer. We are interested in the flipped counties that have a relatively high percentage of African-American residents.

# Democrat Blue and Republican Red
party_colors <- c("#2E74C0", "#CB454A")

# Background layer: counties with flipped == "No", drawn as faint gray
# points on a log-scaled x-axis with comma-formatted labels.
p0 <- ggplot(
  subset(county_data, flipped == "No"),
  aes(x = pop, y = black / 100)
)

p1 <- p0 +
  geom_point(color = "gray50", alpha = 0.15) +
  scale_x_log10(labels = scales::comma)

p1

# Second layer: counties with flipped == "Yes", colored by partywinner16
# using the manual party colors.
p2 <- p1 +
  geom_point(
    data = subset(county_data, flipped == "Yes"),
    mapping = aes(x = pop, y = black / 100, color = partywinner16)
  ) +
  scale_color_manual(values = party_colors)

p2

# Percent-formatted y-axis plus all titles and labels.
p3 <- p2 +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Flipped counties, 2016",
    caption = "Counties in gray did not flip.",
    x = "County Population (log scale)",
    y = "Percent Black Population",
    color = "County flipped to ... "
  )

p3

# Text layer: state labels for flipped counties with black above 25.
p4 <- p3 +
  geom_text_repel(
    data = subset(county_data, flipped == "Yes" & black > 25),
    mapping = aes(x = pop, y = black / 100, label = state),
    size = 2
  )

p4 +
  theme_minimal() +
  theme(legend.position = "top")

3.3.3 Change the appearance of plots with themes

If we want to change the overall look of it all at once, we can do that using ggplot’s theme engine. Themes can be turned on or off using the theme_set() function. It takes the name of a theme (which will itself be a function) as an argument.

Internally, theme functions are a set of detailed instructions to turn on, turn off, or modify a large number of graphical elements on the plot. Once set, a theme applies to all subsequent plots and it remains active until it is replaced by a different theme. This can be done either through the use of another theme_set() statement, or on a per-plot basis by adding the theme function to the end of the plot: p4 + theme_gray() would temporarily override the generally active theme for the p4 object only. You can still use the theme() function to fine-tune any aspect of your plot, as seen above with the relocation of the legend to the top of the graph.

The ggplot library comes with several built-in themes, including theme_minimal() and theme_classic(), with theme_gray() or theme_grey() as the default. If these are not to your taste, install the ggthemes library for many more options.

You can define your own themes either entirely from scratch, or by starting with one you like and making adjustments from there.

Wilke’s cowplot package, for instance, contains a well-developed theme suitable for figures whose final destination is a journal article. Bob Rudis’s hrbrthemes package, meanwhile, has a distinctive and compact look and feel that takes advantage of some freely-available typefaces.

The theme() function allows you to exert very fine-grained control over the appearance of all kinds of text and graphical elements in a plot.

# theme_set(theme_bw())
# p4 + theme(legend.position="top")
# 
# theme_set(theme_dark())
# p4 + theme(legend.position="top")
# 
# theme_set(theme_economist())
# p4 + theme(legend.position="top")

# theme_set(theme_wsj())

# Shrink the title, legend title and caption relative to their theme sizes
# and move the legend to the top.
p4 +
  theme(
    legend.position = "top",
    plot.title = element_text(size = rel(0.6)),
    legend.title = element_text(size = rel(0.35)),
    plot.caption = element_text(size = rel(0.35))
  )

# Legend at the top, everything else left at the theme settings.
p4 +
  theme(legend.position = "top")

# Restyle the title and the x-axis tick labels (font family, face, color
# and relative size).
p4 +
  theme(
    plot.title = element_text(
      family = "Times",
      face = "bold.italic",
      colour = "orange",
      size = rel(2),
      lineheight = .5
    ),
    axis.text.x = element_text(
      family = "Courier",
      face = "bold",
      color = "purple",
      size = rel(1.1)
    ),
    legend.position = "top"
  )

### Use Theme Elements in a Substantive Way

The gss_lon data contains information on the age of each GSS respondent for all the years in the survey since 1972. We will fill the density curves with a dark grey color, and then add an indicator of the mean age in each year, and a text layer for the label. With those in place we then adjust the detail of several theme elements, mostly to remove them. As before we use element_text() to tweak the appearance of various text elements such as titles and labels. And we also use element_blank() to remove several of them altogether. First, we need to calculate the mean age of the respondents for each year of interest. Because the GSS has been around for most (but not all) years since 1972, we will look at distributions about every four years since the beginning.

The initial p object subsets the data by the years we have chosen, and maps x to the age variable. The geom_density() call is the base layer, with arguments to turn off its default line color, set the fill to a shade of gray, and scale the y-axis between zero and one. Using our summarized dataset, the geom_vline() layer draws a vertical white line at the mean age of the distribution.

The ggridges package offers a different take on small-multiple density plots by allowing the distributions to overlap vertically to interesting effect. It is especially useful for repeated distributional measures that change in a clear direction. The expand argument in scale_y_discrete() adjusts the scaling of the y-axis slightly. The package also comes with its own theme, theme_ridges() that adjusts the labels so that they are aligned properly. The degree of overlap in the distributions is controlled by the scale argument in the geom.

Setting these thematic elements in an ad hoc way is often one of the first things people want to do when they make a plot. But making small adjustments to theme elements should be the very last thing you do in the plotting process. Ideally, once you have set up a theme that works well for you, it should be something you can avoid having to do at all.

# Years to display: every fourth year from 1972 to 1988, then 1993, then
# every fourth year from 1996 to 2016.
yrs <- c(
  seq(from = 1972, to = 1988, by = 4),
  1993,
  seq(from = 1996, to = 2016, by = 4)
)
yrs
##  [1] 1972 1976 1980 1984 1988 1993 1996 2000 2004 2008 2012 2016
# Mean respondent age (rounded to a whole number) for each selected year.
# The row filter must use the elementwise `&`: the scalar `&&` collapses the
# condition to a single value (and is an error on vectors from R 4.3), so the
# restriction to `yrs` would not be applied row by row. Missing ages are
# dropped with is.na().
mean_age <- gss_lon %>%
  filter(!is.na(age) & year %in% yrs) %>%
  group_by(year) %>%
  summarize(xbar = round(mean(age, na.rm = TRUE), 0))
mean_age
year xbar
1972 45
1973 44
1974 45
1975 44
1976 45
1977 45
1978 44
1980 45
1982 45
1983 44
1984 44
1985 46
1986 45
1987 45
1988 45
1989 45
1990 46
1991 46
1993 46
1994 46
1996 45
1998 46
2000 46
2002 46
2004 46
2006 47
2008 48
2010 48
2012 48
2014 49
2016 49
# Vertical position at which the mean-age labels are drawn.
mean_age$y <- 0.3

# One year label per selected year, all at the same x/y position.
yr_labs <- data.frame(x = 85, y = 0.8,
                      year = yrs)

# First, we create the plot structure
p <- ggplot(data = subset(gss_lon, year %in% yrs),
            mapping = aes(x = age))

# Layers, in drawing order:
#   1. density curve per year, mapped to the `scaled` variable computed by
#      the stat; after_stat() replaces the deprecated ..scaled.. notation and
#      color = NA is the documented way to drop the outline
#   2. white vertical line at the mean age
#   3. the mean age as a white text label next to that line
#   4. the year label
# All data layers are restricted to `yrs`, and the panels are stacked one
# row per year.
p1 <- p +
  geom_density(fill = "gray20", color = NA,
               alpha = 0.9, mapping = aes(y = after_stat(scaled))) +
  geom_vline(data = subset(mean_age, year %in% yrs),
             aes(xintercept = xbar), color = "white", size = 0.5) +
  geom_text(data = subset(mean_age, year %in% yrs),
            aes(x = xbar, y = y, label = xbar), nudge_x = 7.5,
            color = "white", size = 3.5, hjust = 1) +
  geom_text(data = subset(yr_labs, year %in% yrs),
            aes(x = x, y = y, label = year)) +
  facet_grid(year ~ ., switch = "y")

# With the structure of the plot in place, we then style the elements in the way that we want, using a series of instructions to theme().
# p1 + theme_book(base_size = 10, plot_title_size = 10,
#                 strip_text_size = 32, panel_spacing = unit(0.1, "lines")) +
#     theme(plot.title = element_text(size = 16),
#           axis.text.x= element_text(size = 12),
#           axis.title.y=element_blank(),
#           axis.text.y=element_blank(),
#           axis.ticks.y = element_blank(),
#           strip.background = element_blank(),
#           strip.text.y = element_blank(),
#           panel.grid.major = element_blank(),
#           panel.grid.minor = element_blank()) +
#     labs(x = "Age",
#          y = NULL,
#          title = "Age Distribution of\nGSS Respondents")

# Using the ggridges package
# The years become an ordered factor whose levels are the unique years in
# reverse order of appearance.
p <- ggplot(
  gss_lon,
  aes(
    x = age,
    y = factor(year, levels = rev(unique(year)), ordered = TRUE)
  )
)

# `scale` controls how much neighbouring ridges overlap; `expand` adjusts
# the padding of the discrete y-axis.
p +
  geom_density_ridges(fill = "lightblue", alpha = 0.6, scale = 1.5) +
  scale_x_continuous(breaks = c(25, 50, 75)) +
  scale_y_discrete(expand = c(0.01, 0)) +
  labs(
    title = "Age Distribution of\nGSS Respondents",
    x = "Age",
    y = NULL
  ) +
  theme_ridges() +
  theme(title = element_text(size = 16, face = "bold"))

### Case Studies

#### Two y-axes

R makes it slightly tricky to draw graphs with two y-axes. In fact, ggplot rules it out of order altogether. It is possible to do it using R’s base graphics. Most of the time when people draw plots with two y-axes they want to line the series up as closely as possible because they suspect that there’s a substantive association between them. The main problem with using two y-axes is that it makes it even easier than usual to fool yourself (or someone else) about the degree of association between the variables. This is because you can adjust the scaling of the axes relative to one another in a way that moves the data series around more or less however you like.

We could use a split- or broken-axis plot to show the two series at the same time. These can be effective sometimes, and they seem to have better perceptual properties than overlaid charts with dual axes. Another compromise, if the series are not in the same units (or of widely differing magnitudes), is to rescale one of the series (e.g., by dividing or multiplying it by a thousand), or alternatively to index each of them to 100 at the start of the first period, and then plot them both. Index numbers can have complications of their own, but here they allow us to use one axis instead of two, and also to calculate a sensible difference between the two series and plot that as well.

Now we have our two plots, we want to lay them out nicely. We do not want them to appear in the same plot area, but we do want to compare them. It would be possible to do this with a facet, but that would mean doing a fair amount of data munging to get all three series (the two indices and the difference between them) into the same tidy data frame. An alternative is to make two separate plots and then arrange them just as we like. The cowplot library makes things easy. It has a plot_grid() function that works much like grid.arrange() while also taking care of some fine details, including the proper alignment of axes across separate plot objects.

The broader problem with dual-axis plots of this sort is that the apparent association between these variables is probably spurious. The original plot is enabling our desire to spot patterns, but substantively it is probably the case that both of these time series are tending to increase, but are not otherwise related in any deep way. The use of dual axes is not recommended in general because it is already much too easy to present spurious, or at least overconfident, associations, especially with time series data. Scatterplots can do that just fine. Even with a single series, we can make associations look steeper or flatter by fiddling with the aspect ratio. Using two y-axes gives you an extra degree of freedom to mess about with the data.

# Tidying data
# Print the first rows of fredts to check its columns before reshaping.
head(fredts)
date sp500 monbase sp500_i monbase_i
2009-03-11 696.68 1542228 100.0000 100.0000
2009-03-18 766.73 1693133 110.0548 109.7849
2009-03-25 799.10 1693133 114.7012 109.7849
2009-04-01 809.06 1733017 116.1308 112.3710
2009-04-08 830.61 1733017 119.2240 112.3710
2009-04-15 852.21 1789878 122.3245 116.0579
# Keep the date and the two index columns, then stack the indices into
# key/value form: `series` holds the former column name and `score` its
# value.
fredts_m <- gather(
  select(fredts, date, sp500_i, monbase_i),
  key = series,
  value = score,
  sp500_i:monbase_i
)

head(fredts_m)
date series score
2009-03-11 sp500_i 100.0000
2009-03-18 sp500_i 110.0548
2009-03-25 sp500_i 114.7012
2009-04-01 sp500_i 116.1308
2009-04-08 sp500_i 119.2240
2009-04-15 sp500_i 122.3245
# Plotting
# First plot: both indexed series over time, one colored line per series.
p <- ggplot(
  fredts_m,
  aes(x = date, y = score, group = series, color = series)
)
p1 <- p +
  geom_line() +
  theme(legend.position = "top") +
  labs(x = "Date", y = "Index", color = "Series")

# Second plot: the difference between the two indices over time.
p <- ggplot(fredts, aes(x = date, y = sp500_i - monbase_i))

p2 <- p +
  geom_line() +
  labs(x = "Date", y = "Difference")

# arrange the plots: two rows with relative heights 0.75 and 0.25,
# aligned vertically
cowplot::plot_grid(
  p1, p2,
  nrow = 2,
  rel_heights = c(0.75, 0.25),
  align = "v"
)

#### Redrawing a bad slide

To redraw the chart I took the numbers from the bars on the chart together with employee data from QZ.com. Where there was quarterly data in the slide, I used the end-of-year number for employees, except for 2012. Mayer was appointed in July of 2012. Ideally we would have quarterly revenue and quarterly employee data for all years, but given that we do not, the most sensible thing to do is to keep things annualized except for the one year of interest, when Mayer arrives as CEO. It’s worth doing this because otherwise the large round of layoffs that immediately preceded her arrival would be misattributed to her tenure as CEO. The redrawing is straightforward. We could just draw a scatterplot and color the points by whether Mayer was CEO at the time. We can take a small step further by making a scatterplot but also holding on to the temporal element. We can use geom_path() and use line segments to “join the dots” of the yearly observations in order, labeling each point with its year.

Alternatively, we can keep the analyst community happy by putting time back on the x-axis and plotting the ratio of revenue to employees on the y-axis.

# Print the first and last rows of yahoo
# (presumably psych::headTail(), as psych is loaded at the top of the file).
headTail(yahoo)
Year Revenue Employees Mayer
2004 3574 7600 No
2005 5257 9800 No
2006 6425 11400 No
2007 6969 14300 No
NA
2012 4986 12000 No
2012 4986 11500 Yes
2013 4680 12200 Yes
2014 4618 12500 Yes
p <- ggplot(yahoo, aes(x = Employees, y = Revenue))

# geom_path() joins the observations in the order they appear in the data;
# each point is drawn as its Year, colored by the Mayer column to highlight
# points of interest.
p +
  geom_path(color = "gray80") +
  geom_text(
    mapping = aes(color = Mayer, label = Year),
    size = 3,
    fontface = "bold"
  ) +
  theme(legend.position = "bottom") +
  labs(
    title = "Yahoo Employees vs Revenues, 2004-2014",
    x = "Employees",
    y = "Revenue (Millions)",
    color = "Mayer is CEO"
  ) +
  scale_y_continuous(labels = scales::dollar) +
  scale_x_continuous(labels = scales::comma)

# Alternative version: time on the x-axis, revenue per employee on the y-axis
p <- ggplot(yahoo, aes(x = Year, y = Revenue / Employees))

# A vertical line marks 2012, and a small text annotation sits beside it.
p +
  geom_vline(xintercept = 2012) +
  geom_line(size = 2, color = "gray60") +
  annotate(
    "text",
    x = 2013,
    y = 0.44,
    size = 2.5,
    label = " Mayer becomes CEO"
  ) +
  labs(
    title = "Yahoo Revenue to Employee Ratio, 2004-2014",
    x = "Year\n",
    y = "Revenue/Employees"
  )

#### Saying no to pie

There is a reasonable amount of customization in this graph. First, the text of the facets is made bold in the theme() call. The graphical element is first named (strip.text.x) and then modified using the element_text() function. We also use a custom palette for the fill mapping, via scale_fill_brewer(). And finally we relabel the facets to something more informative than their bare variable names. This is done using the labeller argument and the as_labeller() function inside the facet_grid() call. At the beginning of the plotting code, we set up an object called f_labs, which is in effect a tiny data frame that associates new labels with the values of the type variable in studebt. We use backticks (the angled quote character located next to the ‘1’ key on US keyboards) to pick out the values we want to relabel. The as_labeller() function takes this object and uses it to create new text for the labels when facet_grid() is called.

When the categorical axis labels are long, though, I generally find it’s easier to read them on the y-axis. The colors on the graph are not encoding or mapping any information in the data that is not already taken care of by the faceting. The fill mapping is useful, but also redundant. This graph could easily be in black and white, and would be just as informative if it were.

One thing that is not emphasized in a faceted chart like this is the idea that each of the debt categories is a share or percentage of a total amount.

Instead of having separate bars distinguished by heights, we can array the percentages for each distribution proportionally within a single bar. We will make a stacked bar chart. We are careful to map the debt categories in an ascending sequence of colors, and to adjust the key so that the values run from low to high, from left to right, and from yellow to purple. This is done partly by mapping fill to Debtrc instead of Debt. The categories of the latter are the same as the former, but the sequence of debt levels is coded in the order we want.

The rest of the work is done in the guides() call. We give guides() a series of instructions about the fill mapping: reverse the direction of the color coding; put the legend title above the key; put the labels for the colors below the key; widen the width of the color boxes a little, and place the whole key on a single row.

head(studebt)
Debt type pct Debtrc
Under $5 Borrowers 20 Under $5
$5-$10 Borrowers 17 $5-$10
$10-$25 Borrowers 28 $10-$25
$25-$50 Borrowers 19 $25-$50
$50-$75 Borrowers 8 $50-$75
$75-$100 Borrowers 3 $75-$100
# Labels shared by the student-debt charts; defined once because they are
# reused by more than one plot.
p_xlab <- "Amount Owed, in thousands of Dollars"
p_title <- "Outstanding Student Loans"
p_subtitle <- "44 million borrowers owe a total of $1.3 trillion"
p_caption <- "Source: FRB NY"

# Facet strip labels: names are the values of `type`, values are the text
# shown in the strip (passed to as_labeller() below).
f_labs <- c(`Borrowers` = "Percent of\nall Borrowers",
            `Balances` = "Percent of\nall Balances")

# Faceted bar chart: one panel per `type`, one bar per Debt category.
# pct is divided by 100 so scales::percent labels the axis correctly.
p <- ggplot(data = studebt,
            mapping = aes(x = Debt, y = pct / 100, fill = type))
p + geom_col() + # geom_col() is the idiomatic geom_bar(stat = "identity")
    scale_fill_brewer(type = "qual", palette = "Dark2") +
    scale_y_continuous(labels = scales::percent) +
    # the fill legend only repeats the facets, so it is dropped; "none" is
    # used because guides(fill = FALSE) is deprecated in newer ggplot2
    guides(fill = "none") +
    theme(strip.text.x = element_text(face = "bold")) +
    labs(y = NULL, x = p_xlab,
      caption = p_caption,
      title = p_title,
      subtitle = p_subtitle) +
    facet_grid(~ type, labeller = as_labeller(f_labs)) +
    coord_flip() # long category labels read more easily on the y-axis

# Stacked bar chart: one bar per `type`, split into Debtrc segments.
# pct is divided by 100 so that scales::percent labels the axis correctly.
p <- ggplot(studebt, aes(x = type, y = pct / 100, fill = Debtrc))

p +
  # light gray borders make the individual segments easier to tell apart
  geom_col(color = "gray80") +
  scale_fill_viridis(discrete = TRUE) + # viridis palette for the fill
  scale_y_continuous(labels = scales::percent) +
  scale_x_discrete(labels = as_labeller(f_labs)) +
  labs(
    title = p_title,
    subtitle = p_subtitle,
    caption = p_caption,
    fill = "Amount Owed, in thousands of dollars",
    x = NULL,
    y = NULL
  ) +
  # legend key: reversed order, title above, labels below, wider boxes,
  # everything on a single row
  guides(
    fill = guide_legend(
      nrow = 1,
      keywidth = 3,
      reverse = TRUE,
      title.position = "top",
      label.position = "bottom"
    )
  ) +
  theme(
    legend.position = "top",
    axis.text.y = element_text(face = "bold", hjust = 1, size = 12),
    axis.ticks.length = unit(0, "cm"),
    panel.grid.major.y = element_blank()
  ) +
  coord_flip()

4 Linting

The code in this R Markdown document can be linted with the lintr package, whose default linters follow the tidyverse style guide. The call below is commented out; uncomment it to run the linter.

# lintr::lint("main.Rmd", linters =
#               lintr::with_defaults(
#                 commented_code_linter = NULL,
#                 trailing_whitespace_linter = NULL
#                 )
#             )
# # if you have additional scripts and want them to be linted too, add them here
# lintr::lint("scripts/my_script.R")